{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "3e0d49cc",
   "metadata": {},
   "source": [
    "# Find Duplicates\n",
    "The following example shows how to identify entities that are likely to be duplicates of one through the __ampligraph.discovery.find_duplicates__ API."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "f7d979c1",
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "import os\n",
    "os.environ['CUDA_VISIBLE_DEVICES'] = '0'\n",
    "os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'\n",
    "os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'\n",
    "import tensorflow as tf\n",
    "tf.get_logger().setLevel('ERROR')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "dafa4c11",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import re\n",
    "import numpy as np\n",
    "\n",
    "# The IMDB dataset used here is part of the Movies5 dataset found on:\n",
    "# The Magellan Data Repository (https://sites.google.com/site/anhaidgroup/projects/data)\n",
    "import requests\n",
    "url = 'http://pages.cs.wisc.edu/~anhai/data/784_data/movies5.tar.gz'\n",
    "open('movies5.tar.gz', 'wb').write(requests.get(url).content)\n",
    "import tarfile\n",
    "tar = tarfile.open('movies5.tar.gz', \"r:gz\")\n",
    "tar.extractall()\n",
    "tar.close()\n",
    "\n",
    "# Reading tabular dataset of IMDB movies and filling the missing values\n",
    "imdb = pd.read_csv(\"movies5/csv_files/imdb.csv\")\n",
    "imdb[\"directors\"] = imdb[\"directors\"].fillna(\"UnknownDirector\")\n",
    "imdb[\"actors\"] = imdb[\"actors\"].fillna(\"UnknownActor\")\n",
    "imdb[\"genre\"] = imdb[\"genre\"].fillna(\"UnknownGenre\")\n",
    "imdb[\"duration\"] = imdb[\"duration\"].fillna(\"0\")\n",
    "\n",
    "# Creating knowledge graph triples from tabular dataset\n",
    "imdb_triples = []\n",
    "\n",
    "for _, row in imdb.iterrows():\n",
    "    movie_id = \"ID\" + str(row[\"id\"])\n",
    "    directors = row[\"directors\"].split(\",\")\n",
    "    actors = row[\"actors\"].split(\",\")\n",
    "    genres = row[\"genre\"].split(\",\")\n",
    "    duration = \"Duration\" + str(int(re.sub(\"\\D\", \"\", row[\"duration\"])) // 30)\n",
    "\n",
    "    directors_triples = [(movie_id, \"hasDirector\", d) for d in directors]\n",
    "    actors_triples = [(movie_id, \"hasActor\", a) for a in actors]\n",
    "    genres_triples = [(movie_id, \"hasGenre\", g) for g in genres]\n",
    "    duration_triple = (movie_id, \"hasDuration\", duration)\n",
    "\n",
    "    imdb_triples.extend(directors_triples)\n",
    "    imdb_triples.extend(actors_triples)\n",
    "    imdb_triples.extend(genres_triples)\n",
    "    imdb_triples.append(duration_triple)\n",
    "    \n",
    "imdb_triples = np.array(imdb_triples)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "87ed1243",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Metal device set to: Apple M1 Pro\n",
      "\n",
      "systemMemory: 32.00 GB\n",
      "maxCacheSize: 10.67 GB\n",
      "\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<tensorflow.python.keras.callbacks.History at 0x16b2e90c0>"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from ampligraph.latent_features import ScoringBasedEmbeddingModel\n",
    "\n",
    "# Create, compile and fit the model\n",
    "model = ScoringBasedEmbeddingModel(eta=5, \n",
    "                                   k=300,\n",
    "                                   scoring_type='ComplEx')\n",
    "\n",
    "\n",
    "\n",
    "model.compile(optimizer='adam', \n",
    "              loss='multiclass_nll')\n",
    "\n",
    "\n",
    "model.fit(imdb_triples,\n",
    "          batch_size=10000,\n",
    "          epochs=50, \n",
    "          verbose=False)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "a3cc5988",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              movie_name  year\n",
      "2115           White Air  2007\n",
      "2116           White Air  2007\n",
      "2992  Chor Machaaye Shor  2002\n",
      "2991  Chor Machaaye Shor  2002\n",
      "5856    The Golden Lotus  1974\n",
      "5857    The Golden Lotus  1974\n"
     ]
    }
   ],
   "source": [
    "# Finding duplicates movies (entities)\n",
    "from ampligraph.discovery import find_duplicates\n",
    "\n",
    "# get the unique movies - in this case all subject entities are movies\n",
    "entities = np.unique(imdb_triples[:, 0])\n",
    "# find duplicate movies\n",
    "dups, _ = find_duplicates(entities, model, mode='e', tolerance=0.45)\n",
    "id_list = []\n",
    "for data in dups:\n",
    "    for i in data:\n",
    "        id_list.append(int(i[2:]))\n",
    "print(imdb.iloc[id_list[:6]][['movie_name', 'year']])\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.10.6 ('base')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.6"
  },
  "vscode": {
   "interpreter": {
    "hash": "2e69f3670cdad0193847aaa0b77be56c05c951fcbdd384ff882dde0464f4de76"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
